library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(insuranceData)
library(plotly)
## Warning: Paket 'plotly' wurde unter R Version 4.2.2 erstellt
##
## Attache Paket: 'plotly'
##
## Das folgende Objekt ist maskiert 'package:ggplot2':
##
## last_plot
##
## Das folgende Objekt ist maskiert 'package:stats':
##
## filter
##
## Das folgende Objekt ist maskiert 'package:graphics':
##
## layout
library(scales)
##
## Attache Paket: 'scales'
##
## Das folgende Objekt ist maskiert 'package:purrr':
##
## discard
##
## Das folgende Objekt ist maskiert 'package:readr':
##
## col_factor
data(dataCar)
a. Draw barplots of the discrete variables "numclaims", "agecat" (categorized driver age), and "gender".
ggplot(data = dataCar, mapping = aes(x = numclaims)) +
geom_bar(fill = "#83b692")
ggplot(data = dataCar, mapping = aes(x = agecat)) +
geom_bar(fill = "#F9ADA0")
ggplot(data = dataCar, mapping = aes(x = gender)) +
geom_bar(fill = "#5B3758")
b. Draw a histogram of the vehicle value "veh_value" (in 10'000 Australian Dollars). Truncate values above 7 (this means: if a value is larger than 7, set it to 7).
# use pmin instead of rowwise + min (crazy inefficient)
d <- dataCar %>%
rowwise %>%
mutate(trunc_veh_value = min(veh_value, 7))
p <- ggplot(d, aes(x = trunc_veh_value)) +
geom_histogram(fill = "#83b692", bins = 30) +
ggtitle("Histogram of vehicle value") +
xlab("Price in 10'000 AUD")
p
c. Calculate the average number of claims per level of "agecat" and visualize the result as a scatterplot. Interpret the result.
d <- dataCar %>%
group_by(agecat) %>%
summarise(avg_claims = mean(numclaims))
p <- ggplot(d, mapping = aes(x = agecat, y = avg_claims)) +
geom_point(color = "black") +
xlab("Age category, 1=youngest") +
ggtitle("Scatterplot")
p
Interpretation: The higher the age category, the lower the average
number of claims. Especially, age categories 5 and 6 have a lower
average than the other age categories and age category has a higher
average than the others.
d. Bin "veh_value" into quartiles and analyze its association with the number of claims as in 1c.
d <- dataCar %>%
mutate(quartile_rank = ntile(veh_value,4)) %>%
group_by(quartile_rank) %>%
summarise(avg_claims = mean(numclaims))
p <- ggplot(d, mapping = aes(x = quartile_rank, y = avg_claims)) +
geom_point(color = "black") +
xlab("Quartile rank, 1=0.25 quartile") +
ggtitle("Scatterplot")
p
Interpretation: The higher the value of the vehicle, the higher the
average number of claims.
e. Use the "plotly" package to turn the plot from d. interactive.
p %>%
ggplotly()
The sieve of Eratosthenes is an ancient algorithm to get all prime
numbers up to any given limit \(n\),
see Wikipedia.
Write a function sieve_of_eratosthenes(n) that returns all
prime numbers up to \(n\). Benchmark
the results for \(n = 10^5\) with the
package “microbenchmark”. Mind your coding style!
Naive approach:
sieve_of_eratosthenes <- function(n) {
df <- data.frame(seq(2,n))
colnames(df) <- c("x")
p <- 2
i <- 1
done <- FALSE
while(!done) {
df <- df %>% filter(x %% p != 0 | x == p)
i <- i + 1
if (i > nrow(df)) {
done = TRUE
} else {
p <- df$x[i]
}
}
df
}
head(sieve_of_eratosthenes(1000))
## x
## 1 2
## 2 3
## 3 5
## 4 7
## 5 11
## 6 13
sieve_of_eratosthenes_opt <- function(n) {
v <- seq(2,n)
p <- 2
i <- 1
done <- FALSE
while(!done) {
v <- v[v %% p != 0 | v == p]
i <- i + 1
if (i > length(v)) {
done = TRUE
} else {
p <- v[i]
}
}
v
}
head(sieve_of_eratosthenes_opt(1000))
## [1] 2 3 5 7 11 13
Comparison
library(microbenchmark)
## Warning: Paket 'microbenchmark' wurde unter R Version 4.2.2 erstellt
n = 10000
microbenchmark(
sieve_of_eratosthenes(n),
sieve_of_eratosthenes_opt(n),
times = 2
)
## Unit: milliseconds
## expr min lq mean median uq
## sieve_of_eratosthenes(n) 8626.1765 8626.1765 9027.6796 9027.6796 9429.1826
## sieve_of_eratosthenes_opt(n) 36.2805 36.2805 38.2185 38.2185 40.1565
## max neval cld
## 9429.1826 2 a
## 40.1565 2 b
In Exercise 1c, we have calculated and plotted the average number of
claims per level of “agecat” in the dataCar data.
a. Write a function `avg_claim_counts(v)` that provides such a visualization for any discrete variable `v`.
Source: https://shixiangwang.github.io/tidyeval-chinese/dplyr.html
avg_claim_counts <- function(v) {
v <- enquo(v)
d <- dataCar %>%
group_by(!!v) %>%
summarise(avg_claims = mean(numclaims))
p <- ggplot(d, mapping = aes(x = !!v, y = avg_claims)) +
geom_point(color = "black") +
xlab(v) +
ggtitle("Scatterplot")
p
}
avg_claim_counts(agecat)
avg_claim_counts(gender)
avg_claim_counts(area)
b. Extend this function with a second argument `interactive` to control whether the resulting plot is interactive or not.
library(plotly)
avg_claim_counts_2 <- function(v, interactive = FALSE) {
v <- enquo(v)
d <- dataCar %>%
group_by(!!v) %>%
summarise(avg_claims = mean(numclaims))
p <- ggplot(d, mapping = aes(x = !!v, y = avg_claims)) +
geom_point(color = "black") +
ggtitle("Scatterplot")
if (interactive) {
p <- p %>%
ggplotly()
}
p
}
avg_claim_counts_2(agecat, TRUE)
avg_claim_counts_2(gender, TRUE)
avg_claim_counts_2(area, TRUE)
Extend the “student” class from Section “plot, print, summary” by the
optional information “semester”. It represents the number of semesters
the student is already registered. Add a summary() method
that would neatly print the name and the semester of the student.
# Function that creates an object of class "student"
student <- function(given_name, family_name, semester = NULL) {
out <- list(
given_name = given_name,
family_name = family_name,
semester = semester
)
class(out) <- "student"
out
}
me <- student("Roland", "Widmer")
me2 <- student("Roland", "Widmer", 11)
summary.student <- function(x, ...) {
cat(x$given_name, x$family_name, ifelse(x$semester, paste(", Semester:", x$semester), ""))
invisible(x) # do not output x itself
}
summary(me)
## Roland Widmer
summary(me2)
## Roland Widmer , Semester: 11